map <-
  read_csv("data/HydroWASTE_v10.csv")
## Rows: 58502 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): WWTP_NAME, COUNTRY, CNTRY_ISO, STATUS, LEVEL
## dbl (20): WASTE_ID, SOURCE, ORG_ID, LAT_WWTP, LON_WWTP, QUAL_LOC, LAT_OUT, L...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(map) %>% 
  knitr::kable(digits = 3) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>% 
  kableExtra::scroll_box(width = "100%", height = "300px")
WASTE_ID SOURCE ORG_ID WWTP_NAME COUNTRY CNTRY_ISO LAT_WWTP LON_WWTP QUAL_LOC LAT_OUT LON_OUT STATUS POP_SERVED QUAL_POP WASTE_DIS QUAL_WASTE LEVEL QUAL_LEVEL DF HYRIV_ID RIVER_DIS COAST_10KM COAST_50KM DESIGN_CAP QUAL_CAP
1 1 1140441 Akmenes aglomeracija Lithuania LTU 56.247 22.726 2 56.223 22.627 Not Reported 1060 2 148.213 4 Advanced 1 2421.974 20228874 4.153 0 0 4600 2
2 1 1140443 Alytaus m aglomeracija Lithuania LTU 54.432 24.056 2 54.519 24.098 Not Reported 87900 2 8797.904 1 Advanced 1 2534.527 20261585 257.983 0 0 220000 2
3 1 1140445 Anyksciu aglomeracija Lithuania LTU 55.509 25.073 2 55.452 25.006 Not Reported 12400 2 1959.285 1 Advanced 1 1367.809 20243105 30.995 0 0 33000 2
4 1 1140447 Ariogalos aglomeracija Lithuania LTU 55.252 23.484 2 55.210 23.510 Not Reported 2500 2 578.482 1 Secondary 1 2061.969 20247446 13.799 0 0 4357 2
5 1 1140449 Baisogalos aglomeracija Lithuania LTU 55.644 23.741 2 55.681 23.835 Not Reported 1200 2 167.788 4 Secondary 1 209.549 20239330 0.405 0 0 1490 2
6 1 1140451 Birstono Prienu aglomeracija Lithuania LTU 54.623 24.062 2 54.715 24.094 Not Reported 12400 2 2239.471 1 Advanced 1 10366.240 20256987 268.665 0 0 19000 2
wwtp <-
  read_csv("data/SARS-CoV-2_concentrations_measured_in_NYC_Wastewater_20231129.csv")
## Rows: 4270 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Sample Date, Test date, WRRF Name, WRRF Abbreviation, Annotation, T...
## num (3): Concentration SARS-CoV-2 gene target (N1 Copies/L), Per capita SARS...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(wwtp) %>% 
  knitr::kable(digits = 3) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>% 
  kableExtra::scroll_box(width = "100%", height = "300px")
Sample Date Test date WRRF Name WRRF Abbreviation Concentration SARS-CoV-2 gene target (N1 Copies/L) Per capita SARS-CoV-2 load (N1 copies per day per population) Annotation Population Served, estimated Technology
08/31/2020 09/01/2020 26th Ward 26W 389 263535.6 Concentration below Method Limit of Quantification (above Method Limit of Detection) 290608 RT-qPCR
08/31/2020 09/01/2020 Bowery Bay BB 1204 443632.9 NA 924695 RT-qPCR
08/31/2020 09/01/2020 Coney Island CI 304 168551.6 Concentration below Method Limit of Quantification (above Method Limit of Detection) 682342 RT-qPCR
08/31/2020 09/01/2020 Hunts Point HP 940 574446.6 NA 755948 RT-qPCR
08/31/2020 09/01/2020 Jamaica Bay JA 632 233077.7 NA 748737 RT-qPCR
08/31/2020 09/01/2020 Newtown Creek NC 197 122396.8 Concentration below Method Limit of Quantification (above Method Limit of Detection) 1156473 RT-qPCR
map_df <-
  map |>
  janitor::clean_names() |>
  select(waste_id, wwtp_name, lat_wwtp, lon_wwtp) |>
  mutate(wrrf_abbreviation = recode( wwtp_name,
    "New York C   Rockaway WPCP" = "RK",
    "New York C   Red Hook WPCP" = "RH",
    "New York C   Port Richmond WPCP" = "PR",
    "New York C   Oakwood Beach WPCP" = "OB",
    "New York C   26th Ward WPCP" = "26W",
    "New York C   Tallman Island WPCP" = "TI",
    "New York C   North River WPCP" = "NR",
    "New York C   Coney Island WPCP" = "CI",
    "New York C   Jamaica WPCP" = "JA",
    "New York C   Hunts Point WPCP" = "HP",
    "New York C   Owls Head WPCP" = "OH",
    "New York C   Bowery Bay WPCP" = "BB",
    "New York C   Newtown Creek WPCP" = "NC",
    "New York C   Wards Island WPCP" = "WI",
  ))
wwtp_df <-
  wwtp |>
  janitor:: clean_names()
merge_df <- inner_join(map_df, wwtp_df, by = "wrrf_abbreviation")
nyc_wastewater <-
  merge_df |>
  rename(concentration = concentration_sars_co_v_2_gene_target_n1_copies_l) |>
  drop_na(concentration) |>
  separate(sample_date, into = c("month", "day", "year"), convert = TRUE) %>% 
   mutate(
     year = as.character(year),
     month = factor(month, levels = 1:12),
     month = recode(month,
                        "1" = "January",
                        "2" = "February",
                        "3" = "March",
                        "4" = "April",
                        "5" = "May",
                        "6" = "June",
                        "7" = "July",
                        "8" = "August",
                        "9" = "September",
                        "10" = "October",
                        "11" = "November",
                        "12" = "December")) |>
  select(-waste_id, -test_date, -per_capita_sars_co_v_2_load_n1_copies_per_day_per_population, -population_served_estimated, -wwtp_name, -annotation, -wrrf_abbreviation)

head(nyc_wastewater) %>% 
  knitr::kable(digits = 3) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>% 
  kableExtra::scroll_box(width = "100%", height = "300px")
lat_wwtp lon_wwtp month day year wrrf_name concentration technology
40.788 -73.923 August 31 2020 Wards Island 1339 RT-qPCR
40.788 -73.923 September 8 2020 Wards Island 1065 RT-qPCR
40.788 -73.923 September 13 2020 Wards Island 394 RT-qPCR
40.788 -73.923 September 15 2020 Wards Island 316 RT-qPCR
40.788 -73.923 September 20 2020 Wards Island 1250 RT-qPCR
40.788 -73.923 October 6 2020 Wards Island 1152 RT-qPCR

Creating the dataset for an overall trend for Covid-19 concentration across the 14 waste water facilities within New York State from 2020 to 2023.

overall_trend <-
  nyc_wastewater |>
  filter(technology == "RT-qPCR") |>
  select(-lat_wwtp, -lon_wwtp) |>
  group_by(year, month) |>
  summarise(avg_conc = mean(concentration)) |>
  unite("year_month",year, month, sep = "_") |>
  mutate(month_avg = ymd(paste0(year_month, "_01")))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Plotting Scatterplot

overall_trend |>
  ggplot(aes(x = month_avg, y = avg_conc)) + 
  geom_point(size = 3, alpha = .7) +
  geom_smooth(size = 2, se = FALSE, alpha = .3) +
    labs(
      x = "Time", 
      y = "Monthly average Covid-19 concentration in waste water in NY (N/L)") + 
  theme(axis.line = element_line(color = "grey"), 
      panel.background = element_blank(), 
      legend.position = "none", 
      panel.grid.major = element_line(color = "light grey", linetype = "dashed"),
      plot.title = element_text(hjust = 0.5)) + 
  ggtitle("Monthly average Covid-19 concentration in waste water in NY measured from 2020 to 2023")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Sub dataset

rtqpcr <-
  nyc_wastewater |>
  filter(year %in% c(2021, 2022), technology == "RT-qPCR") |>
  select(-lat_wwtp, -lon_wwtp)

dpcr <-
  nyc_wastewater |>
  filter(year %in% c(2021, 2022), technology == "dPCR") |>
  select(-lat_wwtp, -lon_wwtp)

dpcr is not large enough to do data analysis.

data analysis

spaghetti_plot_2021 <-
  rtqpcr |>
  filter(year == 2021) |>
  group_by(month, wrrf_name) |>
  summarise(avg_conc = mean(concentration)) |>
  ggplot(aes(x = month, y = avg_conc, color = wrrf_name, group = wrrf_name)) +
  geom_line(alpha = .5) +
  geom_point(alpha = .5) +
  labs(x = "Month", 
       y = "Average Covid-19 Concentration of each waste water facility", 
       title = "Average Covid-19 Concentration of each waste water facility in 2021 in New York State") + 
  theme(legend.position = "bottom",
        legend.box.background = element_rect())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
spaghetti_plot_2021

trend_plot_2022

spaghetti_plot_2022 <-
  rtqpcr |>
  filter(year == 2022) |>
  group_by(month, wrrf_name) |>
  summarise(avg_conc = mean(concentration)) |>
  ggplot(aes(x = month, y = avg_conc, color = wrrf_name, group = wrrf_name)) +
  geom_line(alpha = .5) +
  geom_point(alpha = .5) +
  labs(x = "Month", 
       y = "Average Covid-19 Concentration of each waste water facility", 
       title = "Average Covid-19 Concentration of each waste water facility in 2022 in New York State") + 
  theme(legend.position = "bottom",
        legend.box.background = element_rect())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
spaghetti_plot_2022

tables 2021

box_plot_2021 <- rtqpcr %>%
  filter(year == 2021) %>%
  group_by(month, wrrf_name) %>%
  summarise(avg_conc = mean(concentration)) %>%
  plot_ly(
    x = ~avg_conc,
    y = ~wrrf_name,
    type = "box",
    color = ~wrrf_name,
    colors = "viridis"
  ) %>%
  layout(
    xaxis = list(title = "Mean concentration (N/L)"),
    yaxis = list(title = "Area"),
    showlegend = FALSE
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
box_plot_2021

tables 2022

box_plot_2022 <- rtqpcr %>%
  filter(year == 2022) %>%
  group_by(month, wrrf_name) %>%
  summarise(avg_conc = mean(concentration)) %>%
  plot_ly(
    x = ~avg_conc,
    y = ~wrrf_name,
    type = "box",
    color = ~wrrf_name,
    colors = "viridis"
  ) %>%
  layout(
    xaxis = list(title = "Mean concentration (N/L)"),
    yaxis = list(title = "Area"),
    showlegend = FALSE
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
box_plot_2021

##Questions 1 The concentration of covid for each station on average 2 literature review foe two diff tech (RT-qPCR, dPCR) 3 Year 2021 and 2022